This is a report of the modular analysis on the EKD11 transcriptomics dataset/

I have used the Singhania/O’Garra blood modules, using Qusage to quantify the relative encrihment of the modules.

Qusage will assess the mean expression of all genes in a geneset (module) within a treatment group and compare against a control group, and output the log2Fold Enrichment of for each module.

Setting directories for Data

This will load the necessarys packages for the analysis, as well as load the module data.

This analysis requires us to have loaded the samples into DESeq2 to identify the genes found to be differentially expressed (DE) in each comparison, as well as for a log transformed normalised count matrix (normalised to library size).

rm(list = ls())

# Libraries
library(DESeq2)
library(ComplexHeatmap)
library(tidyverse)
library(qusage)
library(ggplot2)
library(reshape2)
library(RColorBrewer)
library(scales)

nf.dir      <- "/Users/alderc/1-projects/CAMP/1-AS_timecourse/2-EKD11/1-Pipeline/1-Nextflow/"
work.dir    <- "/Users/alderc/1-projects/1-PIR_project/3-EKD11_AS_Early_TP/"
r.dir       <- paste(work.dir, "1-R/",sep='') #change once refactor is completed
tmp.dir     <- paste(work.dir,"tmp/",sep='')
data.dir    <- paste(work.dir,"4-data/",sep='')
results.dir <- paste(work.dir,"2-results_2020/",sep='')
genome.dir  <- "/Users/alderc/1-projects/9-Data/1-Reference_genomes/1-Mus_musculus/"


module.anno <- read.csv(file=paste(data.dir, '/modular_analysis/Annotation of Blood modules.csv', sep=''),
                        header=TRUE,
                        sep=',');
module.dat <- read.csv(file=paste(data.dir, '/modular_analysis/Blood modules.csv', sep=''),
                       header=TRUE,
                       sep=',');

count.dat <- read.table(file= paste(results.dir, 'genes.vst_counts_with_controls.xls.gz', sep=''),
                       header=TRUE,
                       sep='\t');
# Load DESeq results list
load(paste(r.dir, "Projects/results.Rdata", sep = ""))

deg.list <- lapply(res.list, function(x){
  x <- x[x$DEG, ]
})


#List of Module IDs
module.num <- module.anno$X
print(module.anno[, -2])
##      X Number.of.genes                             Biological.process
## 1   B1             169                            Ribosome biogenesis
## 2   B2              53                             Ribosome functions
## 3   B3              82         Mitochondria/Oxidative phosphorylation
## 4   B4             732               Myeloid cells/Membrane functions
## 5   B5             144                 Macrophages/Cytokine signaling
## 6   B6             196                 Myeloid cells/Lipid metabolism
## 7   B7             349                     Myeloid cells/Granulocytes
## 8   B8             381       Oxidative phosphorylation/Metabolism/ATP
## 9   B9             408                 Ribosome/Mitochondrial activty
## 10 B10             395                     Cell cycle/DNA replication
## 11 B11             134                             IFN signaling/Il10
## 12 B12             267                      Oxidative phosphorylation
## 13 B13             301 Stress response/Protein folding/Ubiquitination
## 14 B14             172                                  IFN signaling
## 15 B15              81              Cytotoxic/T cells/ILC/Tbx21/Eomes
## 16 B16              82                                   Granulocytes
## 17 B17             220                     Myeloid cells/Granulocytes
## 18 B18             367                     Myeloid cells/Granulocytes
## 19 B19              60                             T cells/Leukocytes
## 20 B20             126                 Cytoskeleton/Calcium signaling
## 21 B21             254            Drug metabolism/Protease inhibitors
## 22 B22              30                                     Metabolism
## 23 B23             134            Drug metabolism/Protease inhibitors
## 24 B24             102              Tissue factor signaling in cancer
## 25 B25             123                          Miscellaneous enzymes
## 26 B26             236               Miscellaneous Adhesion/Signaling
## 27 B27             279                       Epithelial cell function
## 28 B28             264                B cell signaling anddevelopment
## 29 B29             116                B cell signaling anddevelopment
## 30 B30             207                       B cell/Myeloid signaling
## 31 B31             631                          RNA and DNA processes
## 32 B32             557                           Histone modification
## 33 B33             182                     Hormone receptor signaling
## 34 B34             458            Autophagy/Erythrocyte function/Heme
## 35 B35             412                   Integrins/Platelets/Histones
## 36 B36             209                                  Miscellaneous
## 37 B37             198                         T cell differentiation
## 38 B38             252                  B cells/T cells and signaling
## 39 B39             356                            Leukocyte signaling
## 40 B40              95              Cytoskeleton/Actin-based motility
## 41 B41             154                       T cells/T cell signaling

Transforming data to create Modules into Geneset format for Qusage

module.geneset <- lapply(module.num, function(mod){
  dat <- module.dat[module.dat$Module == mod, 'X']; 
  dat <- dat[!is.na(dat)]
  mod <- dat
})

names(module.geneset) <- paste(module.num, module.anno$Biological.process)


#Check to see if Merge was correct
for (mod in names(module.geneset)){
  name <- str_split(mod, " ")[[1]]
  anno <- as.numeric(module.anno[module.anno$X == name[1], 'Number.of.genes']);
  if (length(module.geneset[[mod]]) == anno){
    print(paste("Geneset", mod, 'contains correct number of genes:', anno, sep=' '))
  } else {
    print('Module gene number does not match')
  }
}
## [1] "Geneset B1 Ribosome biogenesis contains correct number of genes: 169"
## [1] "Geneset B2 Ribosome functions contains correct number of genes: 53"
## [1] "Geneset B3 Mitochondria/Oxidative phosphorylation contains correct number of genes: 82"
## [1] "Geneset B4 Myeloid cells/Membrane functions contains correct number of genes: 732"
## [1] "Geneset B5 Macrophages/Cytokine signaling contains correct number of genes: 144"
## [1] "Geneset B6 Myeloid cells/Lipid metabolism contains correct number of genes: 196"
## [1] "Geneset B7 Myeloid cells/Granulocytes contains correct number of genes: 349"
## [1] "Geneset B8 Oxidative phosphorylation/Metabolism/ATP contains correct number of genes: 381"
## [1] "Geneset B9 Ribosome/Mitochondrial activty contains correct number of genes: 408"
## [1] "Geneset B10 Cell cycle/DNA replication contains correct number of genes: 395"
## [1] "Geneset B11 IFN signaling/Il10 contains correct number of genes: 134"
## [1] "Geneset B12 Oxidative phosphorylation contains correct number of genes: 267"
## [1] "Geneset B13 Stress response/Protein folding/Ubiquitination contains correct number of genes: 301"
## [1] "Geneset B14 IFN signaling contains correct number of genes: 172"
## [1] "Geneset B15 Cytotoxic/T cells/ILC/Tbx21/Eomes contains correct number of genes: 81"
## [1] "Geneset B16 Granulocytes contains correct number of genes: 82"
## [1] "Geneset B17 Myeloid cells/Granulocytes contains correct number of genes: 220"
## [1] "Geneset B18 Myeloid cells/Granulocytes contains correct number of genes: 367"
## [1] "Geneset B19 T cells/Leukocytes contains correct number of genes: 60"
## [1] "Geneset B20 Cytoskeleton/Calcium signaling contains correct number of genes: 126"
## [1] "Geneset B21 Drug metabolism/Protease inhibitors contains correct number of genes: 254"
## [1] "Geneset B22 Metabolism contains correct number of genes: 30"
## [1] "Geneset B23 Drug metabolism/Protease inhibitors contains correct number of genes: 134"
## [1] "Geneset B24 Tissue factor signaling in cancer contains correct number of genes: 102"
## [1] "Geneset B25 Miscellaneous enzymes contains correct number of genes: 123"
## [1] "Geneset B26 Miscellaneous Adhesion/Signaling contains correct number of genes: 236"
## [1] "Geneset B27 Epithelial cell function contains correct number of genes: 279"
## [1] "Geneset B28 B cell signaling anddevelopment contains correct number of genes: 264"
## [1] "Geneset B29 B cell signaling anddevelopment contains correct number of genes: 116"
## [1] "Geneset B30 B cell/Myeloid signaling contains correct number of genes: 207"
## [1] "Geneset B31 RNA and DNA processes contains correct number of genes: 631"
## [1] "Geneset B32 Histone modification contains correct number of genes: 557"
## [1] "Geneset B33 Hormone receptor signaling contains correct number of genes: 182"
## [1] "Geneset B34 Autophagy/Erythrocyte function/Heme contains correct number of genes: 458"
## [1] "Geneset B35 Integrins/Platelets/Histones contains correct number of genes: 412"
## [1] "Geneset B36 Miscellaneous contains correct number of genes: 209"
## [1] "Geneset B37 T cell differentiation contains correct number of genes: 198"
## [1] "Geneset B38 B cells/T cells and signaling contains correct number of genes: 252"
## [1] "Geneset B39 Leukocyte signaling contains correct number of genes: 356"
## [1] "Geneset B40 Cytoskeleton/Actin-based motility contains correct number of genes: 95"
## [1] "Geneset B41 T cells/T cell signaling contains correct number of genes: 154"

Now we format the rlog data to the format we need

## Trimming data
rownames(count.dat) <- count.dat$gene_id
sample.cols <- grep(pattern = '^naive|d.\\.', x = names(count.dat), value = TRUE)
count.mat <- count.dat[ , sample.cols];

Modular Quantification

For this analysis, I have compared each transmission group separately against the control (i.e MT vs Naive, SBP vs Naive and RTMT vs Naive).

Overview

Briefly, the script will compare the treatment and control on a individual timepoint (e.g. Day 1), to calculate the fold enrichment of the module, once each module enrichment score is complete it will move onto the next timepoint until all calculations are completed. Qusage will output a log fold enrichment score for each module, as well as p-value for the comparison. Modules with p-value > 0.05 were removed from further analysis.

Finally, the script will look at all the genes within a module, and look to see whether they were DE within the similar comparisons done within DESeq2, and return the proportion of genes within the module that were DE.

# Load Qusage reults (If run already)
if(file.exists(paste(r.dir, "Projects/qusage_results.Rdata", sep = ""))){
  load(paste(r.dir, "Projects/qusage_results.Rdata", sep = ""))
}

MT vs Naive

mvn.cols <- grepl(pattern = "naive|^mt", x= names(count.mat));
mvn.mat <- count.mat[, mvn.cols];

if(!exists("results.mvn")){
  results.mvn <- lapply(c(1,2,3,4,6), function(d){
    cols <- grepl(pattern = paste("d", d ,"|naive", sep=""), x = names(mvn.mat))
    dat <- mvn.mat[, cols]
    print(names(dat))
    labels <- sapply(names(dat), function(x){
      ifelse(grepl(pattern = "mt", x = x), "mt","naive")
    })
    contrast = "mt-naive";
    qusage(dat, labels, contrast, module.geneset, n.points = 2^16)
  })
  names(results.mvn) <- c("Day1", "Day2", "Day3", "Day4", "Day6")
}

qusage.mvn <- lapply(names(results.mvn), function(x){
  df <- qsTable(results.mvn[[x]], number = 41)
  df$log.fold.change <- ifelse(df$p.Value < 0.05, df$log.fold.change, NA)
  df <- df[ ,c(1,2)]
  colnames(df) <- c("Module", paste(x, "_FC", sep = ""))
  df
})

qusage.mvn.all <- Reduce(function(x,y) merge(x = x, y = y, by = "Module", no.dups=T), 
       qusage.mvn)
qusage.mvn.all <- qusage.mvn.all[match(names(module.geneset), qusage.mvn.all$Module), ]
qusage.mvn.all$Module <- factor(qusage.mvn.all$Module, levels = qusage.mvn.all$Module)

qusage.mvn.melt <- melt(qusage.mvn.all, id.vars = "Module")
qusage.mvn.melt$prop <-apply(qusage.mvn.melt, 1, function(prop){
  if (!is.na(prop[["value"]])){
  mod <- prop[[1]]
  day <- as.numeric(str_match(string = prop[2], pattern = "[0-9]"))
  geneset <- module.geneset[[mod]]
  num.genes <- length(geneset)
  deg.name <- grep(pattern = paste('^mt\\.', day, '_vs_naive', sep='')  , x = names(deg.list), value = TRUE);
  deg <- deg.list[[deg.name]]
  deg <- deg[geneset, ] %>% drop_na()
  output <- nrow(deg) / num.genes} else {output <- NA}
  output
})

x <- ggplot(qusage.mvn.melt, aes(x = variable, y = reorder(Module, desc(Module)))) + 
  ggtitle('Mosquito Transmission') + xlab('Day') + ylab('Module') + 
  scale_x_discrete(labels= c(1,2,3,4,6)) +
  geom_count(aes(size = prop, colour = value), na.rm = TRUE) +
  labs(size = "Proportion of DEG in Module", colour = 'log2 Fold Change') +
  scale_color_gradientn(colours = rev(brewer.pal(11, "RdBu")),values = rescale(c(-0.5,0,1.25)), limits = c(-0.5,1.25)) +
  scale_size_area(max_size = 9) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
  panel.background = element_rect(fill = 'white'),
  plot.background = element_blank(),
  legend.background = element_rect(fill = 'transparent'))

x

Results of the modular analysis of Mosquito transmission vs Control. Analysis was performed on all timepoint:module combinations and comparisons with a p-value < 0.05 were removed. The colour of the bubble indicated the intensity of fold change with red showing an upregulation and blue showing a downregulation. Size of the bubble indicates how many genes within a module were DE during that timepoint:module combination.

SBP vs Naive

svn.cols <- grepl(pattern = "naive|^sbp", x= names(count.mat));
svn.mat <- count.mat[, svn.cols];

if(!exists("results.svn")){
  results.svn <- lapply(c(1,2,3,4,6), function(d){
    cols <- grepl(pattern = paste("d", d ,"|naive", sep=""), x = names(svn.mat));
    dat <- svn.mat[, cols]
    print(names(dat))
    labels <- sapply(names(dat), function(x){
      ifelse(grepl(pattern = "sbp", x = x), "sbp","naive")
    })
    contrast = "sbp-naive";
    qusage(dat, labels, contrast, module.geneset, n.points = 2^16);
  })
  names(results.svn) <- c("Day1", "Day2", "Day3", "Day4", "Day6")
}

qusage.svn <- lapply(names(results.svn), function(x){
  df <- qsTable(results.svn[[x]], number = 41)
  df$log.fold.change <- ifelse(df$p.Value < 0.05, df$log.fold.change, NA)
  df <- df[ ,c(1,2)]
  colnames(df) <- c("Module", paste(x, "_FC", sep = ""))
  df
})

qusage.svn.all <- Reduce(function(x,y) merge(x = x, y = y, by = "Module", no.dups=T), 
       qusage.svn)
qusage.svn.all <- qusage.svn.all[match(names(module.geneset), qusage.svn.all$Module), ]
qusage.svn.all$Module <- factor(qusage.svn.all$Module, levels = qusage.svn.all$Module)

qusage.svn.melt <- melt(qusage.svn.all, id.vars = "Module")
qusage.svn.melt$prop <-apply(qusage.svn.melt, 1, function(prop){
  if (!is.na(prop[["value"]])){
  mod <- prop[[1]]
  day <- as.numeric(str_match(string = prop[2], pattern = "[0-9]"))
  geneset <- module.geneset[[mod]]
  num.genes <- length(geneset)
  deg.name <- grep(pattern = paste('^sbp\\.', day, '_vs_naive', sep='')  , x = names(deg.list), value = TRUE);
  deg <- deg.list[[deg.name]]
  deg <- deg[geneset, ] %>% drop_na()
  output <- nrow(deg) / num.genes} else {output <- NA}
  output
})

x <- ggplot(qusage.svn.melt, aes(x = variable, y = reorder(Module, desc(Module)))) + 
  ggtitle('Serially Blood Passaged') + xlab('Day') + ylab('Module') + 
  scale_x_discrete(labels= c(1,2,3,4,6)) +
  geom_count(aes(size = prop, colour = value), na.rm = TRUE) +
  labs(size = "Proportion of DEG in Module", colour = 'Enrichment score') +
  scale_color_gradientn(colours = rev(brewer.pal(11, "RdBu")),values = rescale(c(-0.5,0,1.25)), limits = c(-0.5,1.25)) +
  scale_size_area(max_size = 9) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
  panel.background = element_rect(fill = 'white'),
  plot.background = element_blank(),
  legend.background = element_rect(fill = 'transparent'))

x

Results of the modular analysis of Serially Blood Passaged vs Control. Analysis was performed on all timepoint:module combinations and comparisons with a p-value < 0.05 were removed. The colour of the bubble indicated the intensity of fold change with red showing an upregulation and blue showing a downregulation. Size of the bubble indicates how many genes within a module were DE during that timepoint:module combination.

RTMT vs Naive

rvn.cols <- grepl(pattern = "naive|^rtmt", x= names(count.mat));
rvn.mat <- count.mat[, rvn.cols];

if(!exists("results.rvn")){
  results.rvn <- lapply(c(1,2,3,4,6), function(d){
    cols <- grepl(pattern = paste("d", d ,"|naive", sep=""), x = names(rvn.mat));
    dat <- rvn.mat[, cols]
    print(names(dat))
    labels <- sapply(names(dat), function(x){
      ifelse(grepl(pattern = "rtmt", x = x), "rtmt","naive")
    })
    print(labels)
    contrast = "rtmt-naive";
    qusage(dat, labels, contrast, module.geneset, n.points = 2^16);
  })
  names(results.rvn) <- c("Day1", "Day2", "Day3", "Day4", "Day6")
}

qusage.rvn <- lapply(names(results.rvn), function(x){
  df <- qsTable(results.rvn[[x]], number = 41)
  df$log.fold.change <- ifelse(df$p.Value < 0.05, df$log.fold.change, NA)
  df <- df[ ,c(1,2)]
  colnames(df) <- c("Module", paste(x, "_FC", sep = ""))
  df
})

qusage.rvn.all <- Reduce(function(x,y) merge(x = x, y = y, by = "Module", no.dups=T), 
       qusage.rvn)
qusage.rvn.all <- qusage.rvn.all[match(names(module.geneset), qusage.rvn.all$Module), ]
qusage.rvn.all$Module <- factor(qusage.rvn.all$Module, levels = qusage.rvn.all$Module)

qusage.rvn.melt <- melt(qusage.rvn.all, id.vars = "Module")
qusage.rvn.melt$prop <-apply(qusage.rvn.melt, 1, function(prop){
  if (!is.na(prop[["value"]])){
  mod <- prop[[1]]
  day <- as.numeric(str_match(string = prop[2], pattern = "[0-9]"))
  geneset <- module.geneset[[mod]]
  num.genes <- length(geneset)
  deg.name <- grep(pattern = paste('^rtmt\\.', day, '_vs_naive', sep='')  , x = names(deg.list), value = TRUE);
  deg <- deg.list[[deg.name]]
  deg <- deg[geneset, ] %>% drop_na()
  output <- nrow(deg) / num.genes} else {output <- NA}
  output
})

x <- ggplot(qusage.rvn.melt, aes(x = variable, y = reorder(Module, desc(Module)))) + 
  ggtitle('Recently Mosquito Transmitted') + xlab('Day') + ylab('Module') + 
  scale_x_discrete(labels= c(1,2,3,4,6)) +
  geom_count(aes(size = prop, colour = value), na.rm = TRUE) +
  labs(size = "Proportion of DEG in Module", colour = 'Enrichment score') +
  scale_color_gradientn(colours = rev(brewer.pal(11, "RdBu")),values = rescale(c(-0.5,0,1.25)), limits = c(-0.5,1.25)) +
  scale_size_area(max_size = 9) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
  panel.background = element_rect(fill = 'white'),
  plot.background = element_blank(),
  legend.background = element_rect(fill = 'transparent'))

x

Results of the modular analysis of Recently Mosquito transmission vs Control. Analysis was performed on all timepoint:module combinations and comparisons with a p-value < 0.05 were removed. The colour of the bubble indicated the intensity of fold change with red showing an upregulation and blue showing a downregulation. Size of the bubble indicates how many genes within a module were DE during that timepoint:module combination.

Day 3 - All transmissions

qusage.mvn.melt$transmission <- "MT"
qusage.rvn.melt$transmission <- "RTMT"
qusage.svn.melt$transmission <- "SBP"

qusage.all.melt <- do.call("rbind", list(qusage.mvn.melt, qusage.rvn.melt , qusage.svn.melt))

qusage.all.d3 <- qusage.all.melt[qusage.all.melt$variable == "Day3_FC", ]

x <- ggplot(qusage.all.d3, aes(x = transmission, y = reorder(Module, desc(Module)))) + 
  ggtitle('Day 3 - All Transmission') + xlab('Transmission') + ylab('Module') + 
  geom_count(aes(size = prop, colour = value), na.rm = TRUE) +
  labs(size = "Proportion of DEG in Module", colour = 'Enrichment score') +
  scale_color_gradientn(colours = rev(brewer.pal(11, "RdBu")),values = rescale(c(-0.5,0,1.25)), limits = c(-0.5,1.25)) +
  scale_size_area(max_size = 9) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
  panel.background = element_rect(fill = 'white'),
  plot.background = element_blank(),
  legend.background = element_rect(fill = 'transparent'))

x

Day 4 - All transmissions

qusage.all.d4 <- qusage.all.melt[qusage.all.melt$variable == "Day4_FC", ]

x <- ggplot(qusage.all.d4, aes(x = transmission, y = reorder(Module, desc(Module)))) + 
  ggtitle('Day 4 - All Transmission') + xlab('Transmission') + ylab('Module') + 
  geom_count(aes(size = prop, colour = value), na.rm = TRUE) +
  labs(size = "Proportion of DEG in Module", colour = 'Enrichment score') +
  scale_color_gradientn(colours = rev(brewer.pal(11, "RdBu")),values = rescale(c(-0.5,0,1.25)), limits = c(-0.5,1.25)) +
  scale_size_area(max_size = 9) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
  panel.background = element_rect(fill = 'white'),
  plot.background = element_blank(),
  legend.background = element_rect(fill = 'transparent'))

x

Day 6 - All transmissions

qusage.all.d6 <- qusage.all.melt[qusage.all.melt$variable == "Day6_FC", ]

x <- ggplot(qusage.all.d6, aes(x = transmission, y = reorder(Module, desc(Module)))) + 
  ggtitle('Day 6 - All Transmission') + xlab('Transmission') + ylab('Module') + 
  geom_count(aes(size = prop, colour = value), na.rm = TRUE) +
  labs(size = "Proportion of DEG in Module", colour = 'Enrichment score') +
  scale_color_gradientn(colours = rev(brewer.pal(11, "RdBu")),values = rescale(c(-0.5,0,1.25)), limits = c(-0.5,1.25)) +
  scale_size_area(max_size = 9) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
  panel.background = element_rect(fill = 'white'),
  plot.background = element_blank(),
  legend.background = element_rect(fill = 'transparent'))

x

if(!file.exists(paste(r.dir, "Projects/qusage_results.Rdata", sep = ""))){
save(results.mvn, results.svn, results.rvn, file = paste(r.dir, "Projects/qusage_results.Rdata", sep = ""))
}